library(car)
library(mosaic)
library(DT)
library(tidyverse)

Read in the Data

files <- dir()
rdat <- read.csv(grep(".csv", files, value=TRUE), header=TRUE)
as.tibble(rdat)
# A tibble: 155 x 11
         Y     X1    X2    X3    X4    X5    X6     X7    X8    X9    X10
     <dbl>  <dbl> <int> <int> <int> <int> <int>  <dbl> <dbl> <dbl>  <dbl>
 1  -6.11   2.17      1     1     0     1     1 3.00    6.72 42.2  0.223 
 2  -5.66   2.17      1     1     1     0     0 0.799   8.38 30.0  0.337 
 3  -0.248 -0.333     0     0     1     0     1 2.69   25.1   2.34 0.429 
 4 -11.2   -0.857     1     0     1     1     1 1.31   11.7   6.81 0.626 
 5 -10.6    1.03      1     1     1     0     1 2.64   26.5  32.1  1.26  
 6  -2.81   1.19      1     0     1     0     1 3.15   32.4  28.5  0.422 
 7  -1.86   1.38      0     0     1     0     0 1.50   10.5   8.99 1.30  
 8 -17.8   -0.990     1     1     1     1     1 2.02   21.2  18.6  1.01  
 9  -8.57  -0.875     0     1     1     0     0 1.16   10.4  10.7  1.14  
10   2.13   1.09      1     0     0     0     1 0.0908  1.47 27.3  0.0564
# ... with 145 more rows

Create first Pairs Plot

pairs(rdat, pch=16, cex=1, panel=panel.smooth, col=rgb(.8,.8,.8))

Start with X4 and X5

lm1 <- lm(Y ~ X4 + X5, data=rdat)
summary(lm1)
## 
## Call:
## lm(formula = Y ~ X4 + X5, data = rdat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6441 -2.5074  0.0913  2.1465 13.7465 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.2437     0.6044   3.712 0.000288 ***
## X4           -5.9592     0.6645  -8.968 1.04e-15 ***
## X5           -5.4109     0.6664  -8.119 1.49e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.11 on 152 degrees of freedom
## Multiple R-squared:  0.5059, Adjusted R-squared:  0.4994 
## F-statistic: 77.81 on 2 and 152 DF,  p-value: < 2.2e-16
palette(c("skyblue","orange","green","purple"))
pairs(cbind(R=lm1$res, Fit=lm1$fit, rdat), pch=16, cex=1, panel=panel.smooth, col=interaction(rdat$X4,rdat$X5))

Add X3, X10 suspicious

I also worry about how tightly X7 and X8 are linked to the X4 and X5 variables.

lm2 <- lm(Y ~ X3 + X4 + X5, data=rdat)
summary(lm2)
## 
## Call:
## lm(formula = Y ~ X3 + X4 + X5, data = rdat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.0816 -1.9313 -0.4369  1.4425 15.3090 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.0526     0.6632   6.110 8.08e-09 ***
## X3           -3.1322     0.6148  -5.095 1.03e-06 ***
## X4           -6.1623     0.6172  -9.985  < 2e-16 ***
## X5           -5.4469     0.6177  -8.818 2.63e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.809 on 151 degrees of freedom
## Multiple R-squared:  0.5784, Adjusted R-squared:   0.57 
## F-statistic: 69.04 on 3 and 151 DF,  p-value: < 2.2e-16
palette(c("skyblue","orange","green","purple","blue","red","green4","black"))
pairs(cbind(R=lm2$res, Fit=lm2$fit, rdat), pch=16, cex=1, panel=panel.smooth, col=interaction(rdat$X4,rdat$X5,rdat$X3))

Zoom in on X10

plot(Y ~ X10, data=rdat, col=interaction(X3,X4,X5))

Looks like we need X2

lm3 <- lm(Y ~ X2 + X3 + X4 + X5, data=rdat)
summary(lm3)
## 
## Call:
## lm(formula = Y ~ X2 + X3 + X4 + X5, data = rdat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4498 -1.9356 -0.5587  1.2835 15.1969 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.7692     0.7193   6.630 5.69e-10 ***
## X2           -1.4467     0.6081  -2.379   0.0186 *  
## X3           -3.2842     0.6089  -5.394 2.63e-07 ***
## X4           -6.0713     0.6090  -9.968  < 2e-16 ***
## X5           -5.3872     0.6089  -8.848 2.30e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.751 on 150 degrees of freedom
## Multiple R-squared:  0.5937, Adjusted R-squared:  0.5829 
## F-statistic:  54.8 on 4 and 150 DF,  p-value: < 2.2e-16
palette(c("skyblue","orange","green","purple","blue","red","green4","black"))
pairs(cbind(R=lm3$res, Fit=lm3$fit, rdat), pch=16, cex=1, panel=panel.smooth, col=interaction(rdat$X4,rdat$X5,rdat$X3,rdat$X2))

Zoom in on X10

plot(Y ~ X10, data=rdat, col=interaction(X2,X3,X4,X5))

Capture the parabolas

lm4 <- lm(Y ~ X2 + X3 + X4 + X5 + X10 + X10:X2 + X10:X3 + X10:X4 + X10:X5 + I(X10^2) + I(X10^2):X2 + I(X10^2):X3 + I(X10^2):X4 + I(X10^2):X5, data=rdat)
summary(lm4)
## 
## Call:
## lm(formula = Y ~ X2 + X3 + X4 + X5 + X10 + X10:X2 + X10:X3 + 
##     X10:X4 + X10:X5 + I(X10^2) + I(X10^2):X2 + I(X10^2):X3 + 
##     I(X10^2):X4 + I(X10^2):X5, data = rdat)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.314895 -0.056741 -0.003001  0.062851  0.217688 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.00720    0.03052  131.28   <2e-16 ***
## X2          -1.99237    0.02563  -77.73   <2e-16 ***
## X3          -0.85955    0.02671  -32.18   <2e-16 ***
## X4          -2.99079    0.02806 -106.57   <2e-16 ***
## X5          -5.13591    0.02786 -184.37   <2e-16 ***
## X10          5.12426    0.09374   54.67   <2e-16 ***
## I(X10^2)    -1.12782    0.07143  -15.79   <2e-16 ***
## X2:X10      -2.17083    0.07727  -28.09   <2e-16 ***
## X3:X10      -6.70508    0.08239  -81.38   <2e-16 ***
## X4:X10      -8.54898    0.08566  -99.80   <2e-16 ***
## X5:X10      -7.37333    0.08453  -87.23   <2e-16 ***
## X2:I(X10^2)  2.12915    0.06090   34.96   <2e-16 ***
## X3:I(X10^2)  1.08973    0.06354   17.15   <2e-16 ***
## X4:I(X10^2)  2.05988    0.06527   31.56   <2e-16 ***
## X5:I(X10^2)  4.66846    0.06412   72.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1061 on 140 degrees of freedom
## Multiple R-squared:  0.9997, Adjusted R-squared:  0.9997 
## F-statistic: 3.295e+04 on 14 and 140 DF,  p-value: < 2.2e-16
plot(Y ~ X10, data=rdat, col=interaction(X2,X3,X4,X5))
points(lm4$fitted ~ X10, data=rdat, col=interaction(X2,X3,X4,X5), cex=0.5, pch=16)

Looks like we got it! Now to draw all the curves. (Notice all of the fitted value almost exactly on the center of each dot in the plot above.)

plot(Y ~ X10, data=rdat, col=interaction(X2,X3,X4,X5))
b <- coef(lm4)

X2 = 0; X3 = 0; X4 = 0; X5 = 0;
curve(b[1] + b[2]*X2 + b[3]*X3 + b[4]*X4 + b[5]*X5 + b[6]*x + b[7]*x^2 + b[8]*x*X2 + b[9]*x*X3 + b[10]*x*X4 + b[11]*x*X5 + b[12]*x^2*X2 + b[13]*x^2*X3 + b[14]*x^2*X4 + b[15]*x^2*X5, add=TRUE, col=palette()[1])

X2 = 1; X3 = 0; X4 = 0; X5 = 0;
curve(b[1] + b[2]*X2 + b[3]*X3 + b[4]*X4 + b[5]*X5 + b[6]*x + b[7]*x^2 + b[8]*x*X2 + b[9]*x*X3 + b[10]*x*X4 + b[11]*x*X5 + b[12]*x^2*X2 + b[13]*x^2*X3 + b[14]*x^2*X4 + b[15]*x^2*X5, add=TRUE, col=palette()[2])

X2 = 0; X3 = 1; X4 = 0; X5 = 0;
curve(b[1] + b[2]*X2 + b[3]*X3 + b[4]*X4 + b[5]*X5 + b[6]*x + b[7]*x^2 + b[8]*x*X2 + b[9]*x*X3 + b[10]*x*X4 + b[11]*x*X5 + b[12]*x^2*X2 + b[13]*x^2*X3 + b[14]*x^2*X4 + b[15]*x^2*X5, add=TRUE, col=palette()[3])

X2 = 0; X3 = 0; X4 = 1; X5 = 0;
curve(b[1] + b[2]*X2 + b[3]*X3 + b[4]*X4 + b[5]*X5 + b[6]*x + b[7]*x^2 + b[8]*x*X2 + b[9]*x*X3 + b[10]*x*X4 + b[11]*x*X5 + b[12]*x^2*X2 + b[13]*x^2*X3 + b[14]*x^2*X4 + b[15]*x^2*X5, add=TRUE, col=palette()[5])

X2 = 0; X3 = 0; X4 = 0; X5 = 1;
curve(b[1] + b[2]*X2 + b[3]*X3 + b[4]*X4 + b[5]*X5 + b[6]*x + b[7]*x^2 + b[8]*x*X2 + b[9]*x*X3 + b[10]*x*X4 + b[11]*x*X5 + b[12]*x^2*X2 + b[13]*x^2*X3 + b[14]*x^2*X4 + b[15]*x^2*X5, add=TRUE, col=palette()[1])


X2 = 1; X3 = 1; X4 = 0; X5 = 0;
curve(b[1] + b[2]*X2 + b[3]*X3 + b[4]*X4 + b[5]*X5 + b[6]*x + b[7]*x^2 + b[8]*x*X2 + b[9]*x*X3 + b[10]*x*X4 + b[11]*x*X5 + b[12]*x^2*X2 + b[13]*x^2*X3 + b[14]*x^2*X4 + b[15]*x^2*X5, add=TRUE, col=palette()[4])


X2 = 1; X3 = 0; X4 = 1; X5 = 0;
curve(b[1] + b[2]*X2 + b[3]*X3 + b[4]*X4 + b[5]*X5 + b[6]*x + b[7]*x^2 + b[8]*x*X2 + b[9]*x*X3 + b[10]*x*X4 + b[11]*x*X5 + b[12]*x^2*X2 + b[13]*x^2*X3 + b[14]*x^2*X4 + b[15]*x^2*X5, add=TRUE, col=palette()[6])


X2 = 1; X3 = 0; X4 = 0; X5 = 1;
curve(b[1] + b[2]*X2 + b[3]*X3 + b[4]*X4 + b[5]*X5 + b[6]*x + b[7]*x^2 + b[8]*x*X2 + b[9]*x*X3 + b[10]*x*X4 + b[11]*x*X5 + b[12]*x^2*X2 + b[13]*x^2*X3 + b[14]*x^2*X4 + b[15]*x^2*X5, add=TRUE, col=palette()[2])


X2 = 0; X3 = 1; X4 = 0; X5 = 1;
curve(b[1] + b[2]*X2 + b[3]*X3 + b[4]*X4 + b[5]*X5 + b[6]*x + b[7]*x^2 + b[8]*x*X2 + b[9]*x*X3 + b[10]*x*X4 + b[11]*x*X5 + b[12]*x^2*X2 + b[13]*x^2*X3 + b[14]*x^2*X4 + b[15]*x^2*X5, add=TRUE, col=palette()[3])

X2 = 0; X3 = 1; X4 = 1; X5 = 0;
curve(b[1] + b[2]*X2 + b[3]*X3 + b[4]*X4 + b[5]*X5 + b[6]*x + b[7]*x^2 + b[8]*x*X2 + b[9]*x*X3 + b[10]*x*X4 + b[11]*x*X5 + b[12]*x^2*X2 + b[13]*x^2*X3 + b[14]*x^2*X4 + b[15]*x^2*X5, add=TRUE, col=palette()[7])

X2 = 0; X3 = 0; X4 = 1; X5 = 1;
curve(b[1] + b[2]*X2 + b[3]*X3 + b[4]*X4 + b[5]*X5 + b[6]*x + b[7]*x^2 + b[8]*x*X2 + b[9]*x*X3 + b[10]*x*X4 + b[11]*x*X5 + b[12]*x^2*X2 + b[13]*x^2*X3 + b[14]*x^2*X4 + b[15]*x^2*X5, add=TRUE, col=palette()[5])


X2 = 1; X3 = 1; X4 = 1; X5 = 0;
curve(b[1] + b[2]*X2 + b[3]*X3 + b[4]*X4 + b[5]*X5 + b[6]*x + b[7]*x^2 + b[8]*x*X2 + b[9]*x*X3 + b[10]*x*X4 + b[11]*x*X5 + b[12]*x^2*X2 + b[13]*x^2*X3 + b[14]*x^2*X4 + b[15]*x^2*X5, add=TRUE, col=palette()[8])


X2 = 1; X3 = 0; X4 = 1; X5 = 1;
curve(b[1] + b[2]*X2 + b[3]*X3 + b[4]*X4 + b[5]*X5 + b[6]*x + b[7]*x^2 + b[8]*x*X2 + b[9]*x*X3 + b[10]*x*X4 + b[11]*x*X5 + b[12]*x^2*X2 + b[13]*x^2*X3 + b[14]*x^2*X4 + b[15]*x^2*X5, add=TRUE, col=palette()[6])


X2 = 1; X3 = 1; X4 = 0; X5 = 1;
curve(b[1] + b[2]*X2 + b[3]*X3 + b[4]*X4 + b[5]*X5 + b[6]*x + b[7]*x^2 + b[8]*x*X2 + b[9]*x*X3 + b[10]*x*X4 + b[11]*x*X5 + b[12]*x^2*X2 + b[13]*x^2*X3 + b[14]*x^2*X4 + b[15]*x^2*X5, add=TRUE, col=palette()[4])


X2 = 0; X3 = 1; X4 = 1; X5 = 1;
curve(b[1] + b[2]*X2 + b[3]*X3 + b[4]*X4 + b[5]*X5 + b[6]*x + b[7]*x^2 + b[8]*x*X2 + b[9]*x*X3 + b[10]*x*X4 + b[11]*x*X5 + b[12]*x^2*X2 + b[13]*x^2*X3 + b[14]*x^2*X4 + b[15]*x^2*X5, add=TRUE, col=palette()[7])

X2 = 1; X3 = 1; X4 = 1; X5 = 1;
curve(b[1] + b[2]*X2 + b[3]*X3 + b[4]*X4 + b[5]*X5 + b[6]*x + b[7]*x^2 + b[8]*x*X2 + b[9]*x*X3 + b[10]*x*X4 + b[11]*x*X5 + b[12]*x^2*X2 + b[13]*x^2*X3 + b[14]*x^2*X4 + b[15]*x^2*X5, add=TRUE, col=palette()[8])

Easy way to draw it…

ggplot(rdat, aes(x=X10, y=Y, col=interaction(X2,X3,X4,X5))) + 
  geom_point() +
  geom_smooth(method="lm", se=F, formula=y ~ poly(x, 2, raw=TRUE) )